library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.3
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
#library(nlme)
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(ggplot2)
library(dplyr)
library(sjPlot) #for plotting lmer and glmer mods
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
library(groupdata2)
library(hydroGOF) # rmse()
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
mydata <- read_csv("covid.csv") 
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   name = col_character(),
##   state = col_character(),
##   region = col_character(),
##   govParty = col_character(),
##   metro_area = col_character(),
##   main_econ = col_character()
## )
## See spec(...) for full column specifications.
mydata <- mydata %>%
  select(-c(1:2,29:31))
mydata <- mydata[is.finite(mydata$covid_death_rate_log),]
mydata$name <- as.factor(mydata$name)
mydata$region <- as.factor(mydata$region)
mydata$state <- as.factor(mydata$state)
mydata$govParty <- as.factor(mydata$govParty)
mydata$metro_area <- as.factor(mydata$metro_area)
mydata$main_econ <- as.factor(mydata$main_econ)
mydata$GQ_ESTIMATES_2019 <- log(mydata$GQ_ESTIMATES_2019) 
mydata <- mydata %>%
  filter(GQ_ESTIMATES_2019 != '-Inf') %>%
  mutate(density = total_population/area_sqmi)
head(mydata)
#dim(mydata)
#death_rate <- exp(mydata$covid_death_rate_log)*100
#summary(death_rate)
summary(mydata)
summary(mydata$name)

\(name\): county demographic factors: \(total\_population\), \(percent\_fair\_or\_poor\_health\), \(per\_capita\_income\) social factors: \(percent\_uninsured\), \(percent\_minorities\), \(percent\_below\_poverty\), \(percentile\_rank\_social\_vulnerability\) geographical factors: \(region\) and \(state\) economic factors: \(main\_econ\) and \(metro\_area\) political factors: \(pro\_Trump\) and \(govParty\)

\(GQ\_ESTIMATES\_2019\): the total number of county residents living in group quarters in 2019 (e.g. nursing homes, residential treatment centers, student housing, religious convents/monasteries, correctional facilities, and hospitals)

ggplot(mydata, aes(x=covid_death_rate_log)) +
  geom_histogram(fill = "cornflowerblue", color = "white") +
  facet_wrap(~region, ncol = 4) +
  labs(title = "Death Rates by Region")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(mydata, aes(x=covid_death_rate_log)) +
  geom_histogram(fill = "cornflowerblue", color = "white") +
  facet_wrap(~metro_area, ncol = 3) +
  labs(title = "Death Rates by Metro Area")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(mydata, aes(x=covid_death_rate_log)) +
  geom_histogram(fill = "cornflowerblue", color = "white") +
  facet_wrap(~govParty, ncol = 2) +
  labs(title = "Death Rates by govParty")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

EDA

ggplot(mydata, aes(x = region, y = covid_death_rate_log)) +
    geom_boxplot(notch = TRUE, fill = "cornflowerblue", alpha = .7) + 
    labs(title = "Death Rates by Region")

ggplot(mydata, aes(x = metro_area, y = covid_death_rate_log)) +
    geom_boxplot(notch = TRUE, fill = "cornflowerblue", alpha = .7) + 
    labs(title = "Death Rates by Metro Area")

ggplot(mydata, aes(x = govParty, y = covid_death_rate_log)) +
    geom_boxplot(notch = TRUE, fill = "cornflowerblue", alpha = .7) + 
    labs(title = "Death Rates by govParty")

ggplot(mydata, aes(x = main_econ, y = covid_death_rate_log)) +
    geom_boxplot(notch = TRUE, fill = "cornflowerblue", alpha = .7) + 
    labs(title = "Death Rates by Main Economy")

Northeast seems to have higher log(death rates). metro_area, govParty, main_econ do not seem to have significant impacts on death rates.

ggplot(mydata, aes(x = GQ_ESTIMATES_2019, y = covid_death_rate_log)) +
    geom_point(color= "steelblue") +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(mydata, aes(x = per_capita_income, y = covid_death_rate_log)) +
    geom_point(color= "steelblue") +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(mydata, aes(x = proTrump, y = covid_death_rate_log)) +
    geom_point(color= "steelblue") +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(mydata, aes(x = percent_below_poverty, y = covid_death_rate_log)) +
    geom_point(color= "steelblue") +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(mydata, aes(x = percent_fair_or_poor_health, y = covid_death_rate_log)) +
    geom_point(color= "steelblue") +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(mydata, aes(x = percent_minorities, y = covid_death_rate_log)) +
    geom_point(color= "steelblue") +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(mydata, aes(x = percent_uninsured, y = covid_death_rate_log)) +
    geom_point(color= "steelblue") +
    geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

ggplot(mydata, aes(x = percentile_rank_social_vulnerability, y = covid_death_rate_log)) +
    geom_point(color= "steelblue") +
    geom_smooth(method = "lm") 
## `geom_smooth()` using formula 'y ~ x'

Positive association between group quarters and log(death rates) Negative association between proTrump and log(death rates). Positive association between percent_below_poverty and log(death rates) Positive association between percent_fair_or_poor_health and log(death rates) Positive association between percent_minorities and log(death rates) Positive association between percentile_rank_social_vulnerability and log(death rates) No obvious relationship between log(death rates) and percent_uninsured and per capita income

# subset counties -> Columbia in various states
plotdata <- mydata %>%
    filter(name == "Columbia")
# basic Cleveland plot 
ggplot(plotdata, aes(x=name, y=covid_death_rate_log)) +
    geom_point()

# subset counties -> Franklin in various states
plotdata <- mydata %>%
    filter(name == "Franklin")
# basic Cleveland plot 
ggplot(plotdata, aes(x=name, y=covid_death_rate_log)) +
    geom_point()
# plot death rates by group quarters, for each region, metro_area, main_econ, govParty
ggplot(mydata, aes(x=GQ_ESTIMATES_2019, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~region) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=GQ_ESTIMATES_2019, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~metro_area) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=GQ_ESTIMATES_2019, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~main_econ) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=GQ_ESTIMATES_2019, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~govParty) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")
# plot death rates by percent below poverty, for each region, metro_area, main_econ, govParty
ggplot(mydata, aes(x=percent_below_poverty, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~region) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=percent_below_poverty, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~metro_area) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=percent_below_poverty, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~main_econ) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=percent_below_poverty, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~govParty) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")
# plot death rates by proTrump, for each region, metro_area, main_econ, govParty
ggplot(mydata, aes(x=proTrump, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~region) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=proTrump, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~metro_area) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=proTrump, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~main_econ) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=proTrump, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~govParty) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")
# plot death rates by percent_fair_or_poor_health, for each region, metro_area, main_econ, govParty
ggplot(mydata, aes(x=percent_fair_or_poor_health, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~region) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=percent_fair_or_poor_health, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~metro_area) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=percent_fair_or_poor_health, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~main_econ) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=percent_fair_or_poor_health, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~govParty) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")
# plot death rates by percent_minorities, for each region, metro_area, main_econ, govParty
ggplot(mydata, aes(x=percent_minorities, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~region) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=percent_minorities, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~metro_area) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=percent_minorities, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~main_econ) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=percent_minorities, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~govParty) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")
# plot death rates by percentile_rank_social_vulnerability, for each region, metro_area, main_econ, govParty
ggplot(mydata, aes(x=percentile_rank_social_vulnerability, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~region) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=percentile_rank_social_vulnerability, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~metro_area) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=percentile_rank_social_vulnerability, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~main_econ) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=percentile_rank_social_vulnerability, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~govParty) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")
# plot death rates by percent_uninsured, for each region, metro_area, main_econ, govParty
ggplot(mydata, aes(x=percent_uninsured, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~region) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=percent_uninsured, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~metro_area) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=percent_uninsured, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~main_econ) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=percent_uninsured, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~govParty) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")
# plot death rates by per capita income, for each region, metro_area, main_econ, govParty
ggplot(mydata, aes(x=per_capita_income, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~region) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=per_capita_income, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~metro_area) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=per_capita_income, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~main_econ) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

ggplot(mydata, aes(x=per_capita_income, y = covid_death_rate_log)) +
   geom_smooth(se=FALSE, method="lm", color="grey") +
   geom_point(color="blue") +
   facet_wrap(~govParty) + 
   theme_minimal(base_size = 9) +
   theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
   labs(title = "Changes in Death Rates")

Linear Mixed Model

dotchart(sort(xtabs(~ mydata$name)), cex=0.7) 
## Warning in dotchart(sort(xtabs(~mydata$name)), cex = 0.7): 'x' is neither a
## vector nor a matrix: using as.numeric(x)

dotchart(sort(xtabs(~ mydata$state)), cex=0.7) 
## Warning in dotchart(sort(xtabs(~mydata$state)), cex = 0.7): 'x' is neither a
## vector nor a matrix: using as.numeric(x)

dotchart(sort(xtabs(~ mydata$region)), cex=0.7)
## Warning in dotchart(sort(xtabs(~mydata$region)), cex = 0.7): 'x' is neither a
## vector nor a matrix: using as.numeric(x)

Distribution of potential random effects variables. There are many \(name\) (county), each only contribute one instances). There's many states, with a pretty skewed distribution -> the state effects can be worrying. Finally, there's only few 4 regions.

model1 <- lmer(covid_death_rate_log ~ (1|name) + (1|state) + (1|region), data=mydata)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.0173038 (tol = 0.002, component 1)
summary(model1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: covid_death_rate_log ~ (1 | name) + (1 | state) + (1 | region)
##    Data: mydata
## 
## REML criterion at convergence: 7666.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.6748 -0.4441  0.0924  0.5982  3.8124 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev.
##  name     (Intercept) 0.0007534 0.02745 
##  state    (Intercept) 0.2034087 0.45101 
##  region   (Intercept) 0.2351881 0.48496 
##  Residual             0.8376222 0.91522 
## Number of obs: 2832, groups:  name, 1682; state, 49; region, 4
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  -4.0512     0.2536  -15.98
## convergence code: 0
## Model failed to converge with max|grad| = 0.0173038 (tol = 0.002, component 1)

The estimated variances for the random effects. For \(name\), the optimal variance is 0.00123, which is very close to zero: the model gains nothing by attributing the response to county variation. So \(name\) isn’t a significant effect in the model. Hence we can drop the random effect for \(name\). But for \(state\) and \(region\), the choice of state and region introduce a larger variance.

model2 <- lmer(covid_death_rate_log ~ (1|state) + (1|region), data=mydata)
summary(model2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: covid_death_rate_log ~ (1 | state) + (1 | region)
##    Data: mydata
## 
## REML criterion at convergence: 7666.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.6754 -0.4459  0.0921  0.5980  3.8140 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  state    (Intercept) 0.2034   0.4510  
##  region   (Intercept) 0.2327   0.4823  
##  Residual             0.8384   0.9156  
## Number of obs: 2832, groups:  state, 49; region, 4
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  -4.0513     0.2523  -16.06

Dropping random effect for \(name\) makes almost no difference to the model fit.

model3 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump + percent_below_poverty + percent_fair_or_poor_health + percent_minorities + percent_uninsured + percentile_rank_social_vulnerability + govParty + main_econ + metro_area + (1|state) + (1|region), data=mydata)

model4 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump + percent_below_poverty + percent_fair_or_poor_health + percent_minorities + percent_uninsured + percentile_rank_social_vulnerability + govParty + main_econ + (1|state) + (1|region), data=mydata)

model5 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump + percent_below_poverty + percent_fair_or_poor_health + percent_minorities + percent_uninsured + percentile_rank_social_vulnerability + govParty + metro_area + (1|state) + (1|region), data=mydata)

anova(model3, model4)
anova(model3, model5)

metro_area and main_econ significant

model6 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump + percent_below_poverty + percent_fair_or_poor_health + percent_minorities + percent_uninsured + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)

anova(model3, model6)

govParty not significant

model7 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump + percent_below_poverty + percent_fair_or_poor_health + percent_minorities + percent_uninsured + main_econ + metro_area + (1|state) + (1|region), data=mydata)

anova(model6, model7)

percentile_rank_social_vulnerability significant at 0.05 threshold

model8 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump + percent_below_poverty + percent_fair_or_poor_health + percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)

anova(model6, model8)

percent_uninsured insignificant

model9 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump + percent_below_poverty + percent_fair_or_poor_health + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)

model10 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump + percent_below_poverty + percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)

anova(model8, model9)
anova(model8, model10)

percent_minorities significant at 0.05 threshold percent_fair_or_poor_health insignificant

model11 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + proTrump +  percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)

model12 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)

model13 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)

model14 <- lmer(covid_death_rate_log ~ per_capita_income + percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)

anova(model10, model11)
anova(model11, model12)
anova(model12, model13)
anova(model13, model14)

percent_below_poverty and proTrump insignificant per_capita_income and GQ_ESTIMATES_2019 very significant

model15 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region), data=mydata)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
summary(model15)
## Linear mixed model fit by REML ['lmerMod']
## Formula: covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income +  
##     percent_minorities + percentile_rank_social_vulnerability +  
##     main_econ + metro_area + (1 | state) + (1 | region)
##    Data: mydata
## 
## REML criterion at convergence: 7632.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.2222 -0.4600  0.0812  0.5804  4.0108 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  state    (Intercept) 0.1430   0.3782  
##  region   (Intercept) 0.1988   0.4459  
##  Residual             0.8146   0.9025  
## Number of obs: 2832, groups:  state, 49; region, 4
## 
## Fixed effects:
##                                        Estimate Std. Error t value
## (Intercept)                          -4.785e+00  2.914e-01 -16.419
## GQ_ESTIMATES_2019                    -3.329e-02  1.431e-02  -2.327
## per_capita_income                     2.348e-05  5.243e-06   4.478
## percent_minorities                    4.725e-03  1.440e-03   3.281
## percentile_rank_social_vulnerability  4.197e-01  1.321e-01   3.177
## main_econgovernment                  -8.718e-02  7.949e-02  -1.097
## main_econmanufacturing                7.835e-02  7.291e-02   1.075
## main_econmining                       5.928e-02  8.895e-02   0.666
## main_econnonspecialized               1.972e-01  6.621e-02   2.978
## main_econrecreation                   1.119e-01  8.161e-02   1.371
## metro_areametro                      -2.048e-02  4.657e-02  -0.440
## metro_areanon-metro                  -2.021e-01  4.738e-02  -4.264
## 
## Correlation of Fixed Effects:
##             (Intr) GQ_EST pr_cp_ prcnt_ prc___ mn_cng mn_cnmnf mn_cnmnn mn_cnn
## GQ_ESTIMATE -0.115                                                            
## per_cpt_ncm -0.502 -0.232                                                     
## prcnt_mnrts  0.116 -0.042 -0.202                                              
## prcntl_rn__ -0.395 -0.229  0.666 -0.517                                       
## mn_cngvrnmn -0.065 -0.342  0.051 -0.034 -0.002                                
## mn_cnmnfctr -0.078 -0.162 -0.033  0.044 -0.075  0.609                         
## main_cnmnng -0.040 -0.096 -0.074  0.029 -0.073  0.458  0.458                  
## mn_cnnnspcl -0.059 -0.265 -0.043  0.040 -0.076  0.717  0.728    0.527         
## man_cnrcrtn -0.076 -0.105 -0.092  0.068 -0.019  0.532  0.553    0.421    0.632
## metro_armtr  0.064 -0.236 -0.171 -0.135  0.145  0.015  0.023    0.029   -0.048
## mtr_rnn-mtr -0.107  0.089  0.002 -0.028  0.016  0.028  0.083   -0.028    0.049
##             mn_cnr mtr_rm
## GQ_ESTIMATE              
## per_cpt_ncm              
## prcnt_mnrts              
## prcntl_rn__              
## mn_cngvrnmn              
## mn_cnmnfctr              
## main_cnmnng              
## mn_cnnnspcl              
## man_cnrcrtn              
## metro_armtr  0.051       
## mtr_rnn-mtr  0.008  0.348
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
model16 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region) + per_capita_income*metro_area, data=mydata)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
model17 <- lmer(covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income + percent_minorities + percentile_rank_social_vulnerability + main_econ + metro_area + (1|state) + (1|region) + per_capita_income*metro_area + percent_minorities*per_capita_income, data=mydata)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
anova(model16, model15)
## refitting model(s) with ML (instead of REML)
anova(model17, model16)
## refitting model(s) with ML (instead of REML)
final.model <- model17
summary(final.model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income +  
##     percent_minorities + percentile_rank_social_vulnerability +  
##     main_econ + metro_area + (1 | state) + (1 | region) + per_capita_income *  
##     metro_area + percent_minorities * per_capita_income
##    Data: mydata
## 
## REML criterion at convergence: 7693.4
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.2456 -0.4479  0.0794  0.5721  4.0786 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  state    (Intercept) 0.1387   0.3725  
##  region   (Intercept) 0.1742   0.4174  
##  Residual             0.8124   0.9013  
## Number of obs: 2832, groups:  state, 49; region, 4
## 
## Fixed effects:
##                                         Estimate Std. Error t value
## (Intercept)                           -5.054e+00  3.814e-01 -13.250
## GQ_ESTIMATES_2019                     -3.157e-02  1.443e-02  -2.188
## per_capita_income                      3.360e-05  1.127e-05   2.981
## percent_minorities                     1.142e-02  3.736e-03   3.057
## percentile_rank_social_vulnerability   4.971e-01  1.380e-01   3.601
## main_econgovernment                   -1.002e-01  7.948e-02  -1.261
## main_econmanufacturing                 7.343e-02  7.283e-02   1.008
## main_econmining                        7.092e-02  8.891e-02   0.798
## main_econnonspecialized                1.842e-01  6.623e-02   2.781
## main_econrecreation                    1.089e-01  8.216e-02   1.325
## metro_areametro                       -2.380e-01  2.143e-01  -1.110
## metro_areanon-metro                    1.536e-01  2.306e-01   0.666
## per_capita_income:metro_areametro      8.657e-06  8.933e-06   0.969
## per_capita_income:metro_areanon-metro -1.536e-05  9.819e-06  -1.564
## per_capita_income:percent_minorities  -3.538e-07  1.693e-07  -2.090
## 
## Correlation matrix not shown by default, as p = 15 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
#print(final.model, corr=F)

covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income +
percent_minorities + percentile_rank_social_vulnerability +
main_econ + metro_area + (1 | state) + (1 | region) + per_capita_income *
metro_area + percent_minorities * per_capita_income

Plots

sjPlot::tab_model(final.model, show.re.var= TRUE, 
                  #pred.labels =c("(Intercept)", "per capita income:percent minorities"...),
                  dv.labels= "Effect of Socio-economic Factors on County COVID Death Rates")
  Effect of Socio-economic Factors on County COVID Death Rates
Predictors Estimates CI p
(Intercept) -5.05 -5.80 – -4.31 <0.001
GQ_ESTIMATES_2019 -0.03 -0.06 – -0.00 0.029
per_capita_income 0.00 0.00 – 0.00 0.003
percent_minorities 0.01 0.00 – 0.02 0.002
percentile_rank_social_vulnerability 0.50 0.23 – 0.77 <0.001
main_econ [government] -0.10 -0.26 – 0.06 0.207
main_econ [manufacturing] 0.07 -0.07 – 0.22 0.313
main_econ [mining] 0.07 -0.10 – 0.25 0.425
main_econ
[nonspecialized]
0.18 0.05 – 0.31 0.005
main_econ [recreation] 0.11 -0.05 – 0.27 0.185
metro_area [metro] -0.24 -0.66 – 0.18 0.267
metro_area [non-metro] 0.15 -0.30 – 0.61 0.505
per_capita_income *
metro_area [metro]
0.00 -0.00 – 0.00 0.332
per_capita_income *
metro_area [non-metro]
-0.00 -0.00 – 0.00 0.118
per_capita_income *
percent_minorities
-0.00 -0.00 – -0.00 0.037
Random Effects
σ2 0.81
τ00 state 0.14
τ00 region 0.17
ICC 0.28
N state 49
N region 4
Observations 2832
Marginal R2 / Conditional R2 0.041 / 0.308

Effects are pretty much all significant: GQ_ESTIMATES_2019, per_capita_income, percent_minorities, percentile_rank_social_vulnerability (positive), main_econ [nonspecialized] (baseline: farming), per_capita_income*percent_minorities

#plot for fixed effects
#axis labels should be in order from bottom to top
#To see the values of the effect size and p-value, set show.values and show.p= TRUE. 
#P-values will only be shown if the effect size values are too
sjPlot::plot_model(final.model, 
                   #axis.labels=c("per capita income:percent minorities"...),
                   show.values=TRUE, show.p=TRUE,
                   title="Effect of Socio-economic Factors on County COVID Death Rates")

Model Diagnostics

plot(final.model, type=c("p", smooth))

qqnorm(residuals(final.model))

ggplot(data.frame(lev=hatvalues(final.model), 
       pearson=residuals(final.model, type="pearson")), 
       aes(x=lev, y=pearson)) +
       geom_point() +
       theme_bw()

Residaul plot looks good and no obvious heteroskedasticity. Normality is also good. Seems to have some high leverage points.

# scale-location plots
plot(final.model, sqrt(abs(resid(.))) ~ fitted(.), type = c("p", "smooth"))

CV

#creating a training set
smp_size <- floor(0.8 * nrow(mydata))

set.seed(1)
train_ind <- sample(seq_len(nrow(mydata)), size = smp_size)

train <- mydata[train_ind, ]
test <- mydata[-train_ind, ]
#obtaining the mean-squared error for training set
pred.tr <- predict(final.model, train)
MSE.tr <- sum((pred.tr - train$covid_death_rate_log)^2)/nrow(train)
MSE.tr
## [1] 0.7978782
#obtaining the mean-squared error for test set
pred.te <- predict(final.model, test)
MSE.te <- sum((pred.te - test$covid_death_rate_log)^2)/nrow(test)
MSE.te
## [1] 0.7915444
#obtaining the mean-squared error for dataset
pred <- predict(final.model, mydata)
MSE <- sum((pred - mydata$covid_death_rate_log)^2)/nrow(mydata)
MSE
## [1] 0.7966101
#https://cran.r-project.org/web/packages/groupdata2/vignettes/cross-validation_with_groupdata2.html
set.seed(1) 
train <- fold(train, k = 10)
train <- train %>% arrange(.folds) # order by .folds
crossvalidate <- function(data, k, model, dependent, random = FALSE){
  # 'data' is the training set with the ".folds" column
  # 'k' is the number of folds we have
  # 'model' is a string describing a linear regression model formula
  # 'dependent' is a string with the name of the score column we want to predict
  # 'random' is a logical; do we have random effects in the model?
  
  # Initialize empty list for recording performances
  performances <- c()
  
  # One iteration per fold
  for (fold in 1:k){
    # Create training set for this iteration
    # Subset all the datapoints where .folds does not match the current fold
    training_set <- data[data$.folds != fold,]
    
    # Create test set for this iteration
    # Subset all the datapoints where .folds matches the current fold
    testing_set <- data[data$.folds == fold,]
    
    ## Train model
    # If there is a random effect,
    # use lmer() to train model
    # else use lm()
    if (isTRUE(random)){
      # Train linear mixed effects model on training set
      model <- lmer(model, training_set, REML=FALSE)
    } else {
      # Train linear model on training set
      model <- lm(model, training_set)
    }
    ## Test model
    # Predict the dependent variable in the testing_set with the trained model
    predicted <- predict(model, testing_set, allow.new.levels=TRUE)

    # Get the Root Mean Square Error between the predicted and the observed
    RMSE <- rmse(predicted, testing_set[[dependent]])

    # Add the RMSE to the performance list
    performances[fold] <- RMSE
  }
  # Return the mean of the recorded RMSEs
  #return(performances)
  #return(c('RMSE' = mean(performances)))
  return(mean(performances))
}
#library(hydroGOF) # rmse()
set.seed(1)
RMSE.tr <- crossvalidate(train, k=10, final.model, dependent="covid_death_rate_log", random = TRUE)
## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning: Some predictor variables are on very different scales: consider
## rescaling
MSE.tr <- RMSE.tr^2
MSE.tr
## [1] 0.8341988

MSE: 0.8341988

# Creating the model for the full training set
cv.model <- lmer(final.model, train, REML = FALSE)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00292805 (tol = 0.002, component 1)
# Predict the dependent variable in the test_set with the trained model
predicted <- predict(cv.model, test, allow.new.levels=TRUE)

# Get the Root Mean Square Error between the predicted and the observed
RMSE.te <- rmse(predicted, test[["covid_death_rate_log"]])
MSE.te <- RMSE.te^2
MSE.te
## [1] 0.8273282

MSE: 0.8273282

# Summarize the results
summary(cv.model)
## Linear mixed model fit by maximum likelihood  ['lmerMod']
## Formula: covid_death_rate_log ~ GQ_ESTIMATES_2019 + per_capita_income +  
##     percent_minorities + percentile_rank_social_vulnerability +  
##     main_econ + metro_area + (1 | state) + (1 | region) + per_capita_income *  
##     metro_area + percent_minorities * per_capita_income
##    Data: train
## 
##      AIC      BIC   logLik deviance df.resid 
##   6087.3   6190.4  -3025.7   6051.3     2247 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.1585 -0.4569  0.0798  0.5680  3.9468 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  state    (Intercept) 0.1262   0.3552  
##  region   (Intercept) 0.0980   0.3131  
##  Residual             0.8102   0.9001  
## Number of obs: 2265, groups:  state, 49; region, 4
## 
## Fixed effects:
##                                         Estimate Std. Error t value
## (Intercept)                           -5.089e+00  3.887e-01 -13.093
## GQ_ESTIMATES_2019                     -2.276e-02  1.600e-02  -1.423
## per_capita_income                      3.292e-05  1.262e-05   2.608
## percent_minorities                     1.264e-02  4.253e-03   2.972
## percentile_rank_social_vulnerability   4.556e-01  1.531e-01   2.976
## main_econgovernment                   -6.087e-02  8.968e-02  -0.679
## main_econmanufacturing                 5.506e-02  8.184e-02   0.673
## main_econmining                        6.630e-02  9.929e-02   0.668
## main_econnonspecialized                1.891e-01  7.438e-02   2.542
## main_econrecreation                    1.373e-01  9.162e-02   1.499
## metro_areametro                       -2.025e-01  2.383e-01  -0.850
## metro_areanon-metro                    1.110e-01  2.612e-01   0.425
## per_capita_income:metro_areametro      7.980e-06  9.931e-06   0.804
## per_capita_income:metro_areanon-metro -1.323e-05  1.109e-05  -1.193
## per_capita_income:percent_minorities  -4.110e-07  1.931e-07  -2.129
## 
## Correlation matrix not shown by default, as p = 15 > 12.
## Use print(x, correlation=TRUE)  or
##     vcov(x)        if you need it
## fit warnings:
## Some predictor variables are on very different scales: consider rescaling
## convergence code: 0
## Model failed to converge with max|grad| = 0.00292805 (tol = 0.002, component 1)
sjPlot::tab_model(cv.model, show.re.var= TRUE, 
                  #pred.labels =c("(Intercept)", "per capita income:percent minorities"...),
                  dv.labels= "Effect of Socio-economic Factors on County COVID Death Rates")
  Effect of Socio-economic Factors on County COVID Death Rates
Predictors Estimates CI p
(Intercept) -5.09 -5.85 – -4.33 <0.001
GQ_ESTIMATES_2019 -0.02 -0.05 – 0.01 0.155
per_capita_income 0.00 0.00 – 0.00 0.009
percent_minorities 0.01 0.00 – 0.02 0.003
percentile_rank_social_vulnerability 0.46 0.16 – 0.76 0.003
main_econ [government] -0.06 -0.24 – 0.11 0.497
main_econ [manufacturing] 0.06 -0.11 – 0.22 0.501
main_econ [mining] 0.07 -0.13 – 0.26 0.504
main_econ
[nonspecialized]
0.19 0.04 – 0.33 0.011
main_econ [recreation] 0.14 -0.04 – 0.32 0.134
metro_area [metro] -0.20 -0.67 – 0.26 0.395
metro_area [non-metro] 0.11 -0.40 – 0.62 0.671
per_capita_income *
metro_area [metro]
0.00 -0.00 – 0.00 0.422
per_capita_income *
metro_area [non-metro]
-0.00 -0.00 – 0.00 0.233
per_capita_income *
percent_minorities
-0.00 -0.00 – -0.00 0.033
Random Effects
σ2 0.81
τ00 state 0.13
τ00 region 0.10
ICC 0.22
N state 49
N region 4
Observations 2265
Marginal R2 / Conditional R2 0.042 / 0.250

GQ_ESTIMATES_2019 no longer significant

# Predict the dependent variable in the full data with the trained model
predicted <- predict(cv.model, mydata, allow.new.levels=TRUE)

# Get the Root Mean Square Error between the predicted and the observed
RMSE <- rmse(predicted, mydata[["covid_death_rate_log"]])
MSE <- RMSE^2
MSE
## [1] 0.8023507

MSE: 0.8023507